/*******************************************************************************
* Four-Way Comparison Table: WBES (≥5) vs WBES (5–50) vs Study Sample (full) vs Study Sample (5–50)
* Comparing firms of different sizes across datasets
*******************************************************************************/

* Start from a clean session (drops data, matrices, programs) and disable
* -more- pauses so the do-file runs unattended.
clear all
set more off

********************************************************************************
* PART 1: CALCULATE STATISTICS FROM OUR DATASET (FIRMS WITH 5-50 EMPLOYEES)
********************************************************************************

*** Study dataset, wave 1, restricted to firms with 5 to 50 employees.
*** Forward slashes are used in the path because Stata accepts them on every
*** platform, whereas a backslash separator is Windows-only.
use "$data/2_firm_ano_perfvars_individual_paper.dta", clear

keep if wave == 1
* Relational operators treat missing as larger than any number, so the upper
* bound also removes firms with missing firmsize.
keep if firmsize >= 5 & firmsize <= 50

*** Create variables to match the WBES naming used in the comparison table
* NOTE(review): presumably revenue_wins5_base is already in thousand USD -- confirm.
gen annual_sales_th_USD = revenue_wins5_base

* Revenue-based size classes
gen micro_firm = sized1
gen small_firm = sized2
gen medium_firm = sized3

* Broad sector dummies
gen sector_ag_manufacturing = broadsecd1
gen sector_construction = broadsecd2
gen sector_wholesale = broadsecd3
gen sector_services = broadsecd4

gen firm_age = firmage

* Employment-size dummies carried over from the source data (not part of the
* reported variable list)
gen staff_1_3 = empsized1
gen staff_4_6 = empsized2
gen staff_more_6 = empsized3

* Employment-size dummies built from firmsize. After the 5-50 restriction only
* staff_5_10 and staff_11_50 can be non-zero; the others are kept so that every
* sample reports the same set of rows.
gen staff_1 = firmsize <2
gen staff_2 = (firmsize >=2 & firmsize <3)
gen staff_3_4 = (firmsize >=3 & firmsize <5)
gen staff_5_10 = (firmsize >=5 & firmsize <=10)
gen staff_11_50 = (firmsize >10 & firmsize <=50)
gen staff_more_50 = firmsize >50


*** Variables reported in every column of the comparison table.
*** The local -varlist- is reused by all later sections of this do-file.
local varlist abidjan annual_sales_th_USD micro_firm small_firm medium_firm  ///
               sector_ag_manufacturing sector_construction  sector_wholesale sector_services  ///
              firm_age firmsize staff_1 staff_2 staff_3_4 staff_5_10 staff_11_50 staff_more_50

*** Unweighted mean and observation count per variable for the data currently
*** in memory, stored as 1 x K row vectors (K = number of variables in varlist)
local n_vars : word count `varlist'
tempname study_large_means study_large_ns
matrix `study_large_means' = J(1, `n_vars', .)
matrix `study_large_ns' = J(1, `n_vars', .)

forvalues col = 1/`n_vars' {
    * Column position in the matrices follows the order of varlist
    local v : word `col' of `varlist'
    quietly summarize `v'
    matrix `study_large_means'[1,`col'] = r(mean)
    matrix `study_large_ns'[1,`col'] = r(N)
}

********************************************************************************
* PART 2: CALCULATE STATISTICS FROM OUR DATASET (full sample)
********************************************************************************

*** Study dataset, wave 1, all firm sizes (no restriction on employment).
*** Forward slashes are used in the path because Stata accepts them on every
*** platform, whereas a backslash separator is Windows-only.
use "$data/2_firm_ano_perfvars_individual_paper.dta", clear

keep if wave == 1

*** Create variables to match the WBES naming used in the comparison table
* NOTE(review): presumably revenue_wins5_base is already in thousand USD -- confirm.
gen annual_sales_th_USD = revenue_wins5_base

* Revenue-based size classes
gen micro_firm = sized1
gen small_firm = sized2
gen medium_firm = sized3

* Broad sector dummies
gen sector_ag_manufacturing = broadsecd1
gen sector_construction = broadsecd2
gen sector_wholesale = broadsecd3
gen sector_services = broadsecd4

gen firm_age = firmage

* Employment-size dummies carried over from the source data (not part of the
* reported variable list)
gen staff_1_3 = empsized1
gen staff_4_6 = empsized2
gen staff_more_6 = empsized3

* Employment-size dummies built from firmsize.
* This sample is not filtered on firmsize, so missing values can be present.
* Stata treats missing as larger than any number: without the guard a firm with
* missing firmsize would get staff_more_50 == 1 and 0 in every other category.
* With the guard such firms are missing in all six dummies and drop out of the
* corresponding means and Ns.
gen staff_1 = (firmsize < 2) if !missing(firmsize)
gen staff_2 = (firmsize >= 2 & firmsize < 3) if !missing(firmsize)
gen staff_3_4 = (firmsize >= 3 & firmsize < 5) if !missing(firmsize)
gen staff_5_10 = (firmsize >= 5 & firmsize <= 10) if !missing(firmsize)
gen staff_11_50 = (firmsize > 10 & firmsize <= 50) if !missing(firmsize)
gen staff_more_50 = (firmsize > 50) if !missing(firmsize)

*** Full study sample: unweighted mean and N for each variable in varlist,
*** stored as 1 x K row vectors in the order of varlist
tempname study_full_means study_full_ns
local n_cols : word count `varlist'
matrix `study_full_means' = J(1, `n_cols', .)
matrix `study_full_ns' = J(1, `n_cols', .)

local col = 0
foreach v of local varlist {
    local ++col
    quietly summarize `v'
    matrix `study_full_means'[1,`col'] = r(mean)
    matrix `study_full_ns'[1,`col'] = r(N)
}


********************************************************************************
* PART 4: CALCULATE STATISTICS FROM WBES DATASET  
********************************************************************************

*** WBES: firms with at least 5 employees.
*** Forward slashes are used in the path because Stata accepts them on every
*** platform, whereas a backslash separator is Windows-only.
use "$data/WBES_data.dta", clear

* Stata treats missing as larger than any number, so "l1 >= 5" alone would keep
* firms with missing employment (and later flag them as staff_more_50).
keep if l1 >= 5 & !missing(l1)
svyset idstd [pweight=wmedian], strata(strata) singleunit(scaled)
local deflator_2022_2020 = 0.944  // Deflate 2022 to 2020 (~3% annual inflation)

* Derived variables
gen abidjan = (stratificationregioncode == 1)
gen annual_sales = d2
* NOTE(review): if d2 still carries negative non-response codes (e.g. -9) they
* are treated as real sales below -- confirm they were recoded to missing upstream.
* NOTE(review): 537.286 is presumably the FCFA-per-USD exchange rate -- confirm.
gen annual_sales_th_USD = annual_sales / 1000 / 537.286 * `deflator_2022_2020'

* Revenue-based size classes (thresholds: 30,000,000 and 150,000,000 in the units of d2).
* Left missing when sales are missing: a missing value would otherwise satisfy
* "> 150000000" and be counted as a medium firm.
gen micro_firm = (annual_sales < 30000000) if !missing(annual_sales)
gen small_firm = (annual_sales >= 30000000 & annual_sales <= 150000000) if !missing(annual_sales)
gen medium_firm = (annual_sales > 150000000) if !missing(annual_sales)

* Employment and employment-size dummies (l1 is non-missing after the filter above)
gen firmsize = l1
gen staff_1 = l1 <2
gen staff_2 = (l1 >=2 & l1 <3)
gen staff_3_4 = (l1 >=3 & l1 <5)
gen staff_5_10 = (l1 >=5 & l1 <=10)
gen staff_11_50 = (l1 >10 & l1 <=50)
gen staff_more_50 = l1 >50

* Sector dummies from d1a1a; missing when the sector code is missing
gen sector_ag_manufacturing = (d1a1a == 1) if !missing(d1a1a)
gen sector_construction = (d1a1a == 4) if !missing(d1a1a)
gen sector_services = inlist(d1a1a, 6, 51, 52) if !missing(d1a1a)
gen sector_wholesale = inlist(d1a1a, 2, 3) if !missing(d1a1a)
gen sector_other = 0

* Firm age relative to 2023; only computed for positive, non-missing b5
gen firm_age = 2023 - b5 if !missing(b5) & b5 > 0

*** Survey-weighted mean and N per variable for the WBES data in memory,
*** stored as 1 x K row vectors in the order of varlist
tempname wbes_means wbes_ns
matrix `wbes_means' = J(1, `: word count `varlist'', .)
matrix `wbes_ns' = J(1, `: word count `varlist'', .)

local i = 1
foreach var of local varlist {
    capture svy: mean `var'
    if _rc == 0 {
        matrix `wbes_means'[1,`i'] = _b[`var']
        matrix `wbes_ns'[1,`i'] = e(N)
    }
    else {
        * Save the return code before any other command overwrites _rc, and
        * report the fallback: otherwise an unweighted mean would end up in a
        * column of weighted means without any trace in the log.
        local svy_rc = _rc
        display as error "WBES: svy: mean failed for `var' (rc = `svy_rc'); storing the UNWEIGHTED mean instead"
        qui sum `var'
        matrix `wbes_means'[1,`i'] = r(mean)
        matrix `wbes_ns'[1,`i'] = r(N)
    }
    local i = `i' + 1
}

********************************************************************************
* PART 4b: CALCULATE WBES STATISTICS (Firms with ≤50 employees)
********************************************************************************

*** WBES: firms with 5 to 50 employees.
*** Forward slashes are used in the path because Stata accepts them on every
*** platform, whereas a backslash separator is Windows-only.
use "$data/WBES_data.dta", clear

* The upper bound also removes missing l1 (missing compares as larger than 50)
keep if l1 >= 5 & l1 <= 50
svyset idstd [pweight=wmedian], strata(strata) singleunit(scaled)

local deflator_2022_2020 = 0.944  // Deflate 2022 to 2020 (~3% annual inflation)

* Derived variables (same definitions as for the unrestricted WBES sample)
gen abidjan = (stratificationregioncode == 1)
gen annual_sales = d2
* NOTE(review): if d2 still carries negative non-response codes (e.g. -9) they
* are treated as real sales below -- confirm they were recoded to missing upstream.
* NOTE(review): 537.286 is presumably the FCFA-per-USD exchange rate -- confirm.
gen annual_sales_th_USD = annual_sales / 1000 / 537.286 * `deflator_2022_2020'

* Revenue-based size classes (thresholds: 30,000,000 and 150,000,000 in the units of d2).
* Left missing when sales are missing: a missing value would otherwise satisfy
* "> 150000000" and be counted as a medium firm.
gen micro_firm = (annual_sales < 30000000) if !missing(annual_sales)
gen small_firm = (annual_sales >= 30000000 & annual_sales <= 150000000) if !missing(annual_sales)
gen medium_firm = (annual_sales > 150000000) if !missing(annual_sales)

* Employment and employment-size dummies (l1 is non-missing after the filter above)
gen firmsize = l1
gen staff_1 = l1 <2
gen staff_2 = (l1 >=2 & l1 <3)
gen staff_3_4 = (l1 >=3 & l1 <5)
gen staff_5_10 = (l1 >=5 & l1 <=10)
gen staff_11_50 = (l1 >10 & l1 <=50)
gen staff_more_50 = l1 >50

* Sector dummies from d1a1a; missing when the sector code is missing
gen sector_ag_manufacturing = (d1a1a == 1) if !missing(d1a1a)
gen sector_construction = (d1a1a == 4) if !missing(d1a1a)
gen sector_services = inlist(d1a1a, 6, 51, 52) if !missing(d1a1a)
gen sector_wholesale = inlist(d1a1a, 2, 3) if !missing(d1a1a)
gen sector_other = 0

* Firm age relative to 2023; only computed for positive, non-missing b5
gen firm_age = 2023 - b5 if !missing(b5) & b5 > 0

*** Survey-weighted mean and N per variable for the WBES data in memory,
*** stored as 1 x K row vectors in the order of varlist.
*** The matrices keep the historical "wbes130" names because later sections
*** refer to them.
tempname wbes130_means wbes130_ns
matrix `wbes130_means' = J(1, `: word count `varlist'', .)
matrix `wbes130_ns' = J(1, `: word count `varlist'', .)

local i = 1
foreach var of local varlist {
    capture svy: mean `var'
    if _rc == 0 {
        matrix `wbes130_means'[1,`i'] = _b[`var']
        matrix `wbes130_ns'[1,`i'] = e(N)
    }
    else {
        * Save the return code before any other command overwrites _rc, and
        * report the fallback: otherwise an unweighted mean would end up in a
        * column of weighted means without any trace in the log.
        local svy_rc = _rc
        display as error "WBES (restricted sample): svy: mean failed for `var' (rc = `svy_rc'); storing the UNWEIGHTED mean instead"
        qui sum `var'
        matrix `wbes130_means'[1,`i'] = r(mean)
        matrix `wbes130_ns'[1,`i'] = r(N)
    }
    local i = `i' + 1
}


********************************************************************************
* PART 5: CREATE COMPARISON LATEX TABLE
********************************************************************************

* Writes the comparison table as a LaTeX fragment, one row per variable in
* varlist and one (Mean, N) column pair per sample. Reads the row vectors
* wbes_means/wbes_ns, wbes130_means/wbes130_ns, study_full_means/study_full_ns
* and study_large_means/study_large_ns filled in the earlier sections.

* Close a handle left open by an earlier aborted run; -file open- would
* otherwise fail because the handle name is already in use.
capture file close comptable
file open comptable using "$results/01_tables/Table_S8_comparison_WBES_study_sample.tex", write replace

* Table header.
* \caption and \label are written before \scalebox is opened: the argument of
* \scalebox is typeset as a horizontal box, which is not a valid place for a caption.
* Column heads use plain ASCII ranges instead of the raw <=/>= glyphs, which a
* default LaTeX setup does not define, and name the actual 5-50 employee filter.
file write comptable "\begin{table}[htbp]" _n
file write comptable "\centering" _n
file write comptable "\caption{Comparison of Summary Statistics Across Datasets}" _n
file write comptable "\label{tab:comparison_four}" _n
file write comptable "\scalebox{0.75}{" _n
file write comptable "\begin{tabular}{lcccccccc}" _n
file write comptable "\toprule" _n
file write comptable " & \multicolumn{2}{c}{WBES (2023)} & \multicolumn{2}{c}{WBES (5--50)} & \multicolumn{2}{c}{Study (full)} & \multicolumn{2}{c}{Study (5--50)} \\" _n
file write comptable "Variable & Mean & N & Mean & N & Mean & N & Mean & N \\" _n
file write comptable "\midrule" _n

* Row labels, in the same order as varlist
local varlabels `" "Abidjan" "Annual Sales (th USD)" "Micro Firm" "Small Firm" "Medium Firm" "Manufacturing \& Agriculture" "Construction" "Wholesale and Retail" "Other Services" "Firm Age" "Firm Size" "Staff 1" "Staff 2" "Staff 3-4" "Staff 5-10" "Staff 11-50" "Staff 51+" "'

* Write each variable row; i indexes both the label list and the matrix columns
local i = 1
foreach var of local varlist {

    local varlabel : word `i' of `varlabels'

    * Get means and N from matrices
    local wbes_mean = `wbes_means'[1,`i']
    local wbes_n = `wbes_ns'[1,`i']
    local wbes130_mean = `wbes130_means'[1,`i']
    local wbes130_n = `wbes130_ns'[1,`i']
    local study_full_mean = `study_full_means'[1,`i']
    local study_full_n = `study_full_ns'[1,`i']
    local study_large_mean = `study_large_means'[1,`i']
    local study_large_n = `study_large_ns'[1,`i']

    * Column order: WBES (2023), WBES (5--50), Study (full), Study (5--50)
    file write comptable "`varlabel' & " ///
        %9.3f (`wbes_mean') " & " %9.0f (`wbes_n') " & " ///
        %9.3f (`wbes130_mean') " & " %9.0f (`wbes130_n') " & " ///
        %9.3f (`study_full_mean') " & " %9.0f (`study_full_n') " & " ///
        %9.3f (`study_large_mean') " & " %9.0f (`study_large_n') " \\" _n

    local i = `i' + 1
}

* Table footer
file write comptable "\bottomrule" _n
file write comptable "\end{tabular}" _n
file write comptable "}" _n  // Close scalebox
* Notes describe the four columns that are actually written above.
* "mn.\ FCFA" uses a control space; a double backslash would force a line break.
file write comptable "\begin{tablenotes}" _n
file write comptable "\footnotesize" _n
file write comptable "\item Notes: This table compares descriptive statistics from the World Bank " _n
file write comptable "Enterprise Survey (WBES 2023) for Côte d'Ivoire with the study sample. " _n
file write comptable "WBES (2023) covers WBES firms with at least 5 employees and WBES (5--50) those " _n
file write comptable "with 5 to 50 employees; Study (full) covers all firms in the study sample and " _n
file write comptable "Study (5--50) those with 5 to 50 employees. WBES statistics are calculated " _n
file write comptable "using survey weights; study-sample statistics are unweighted. " _n
file write comptable "Micro-sized firms: revenue below 30 mn.\ FCFA; small-sized firms: revenue " _n
file write comptable "30--150 mn.\ FCFA; medium-sized firms: revenue above 150 mn.\ FCFA." _n
file write comptable "\end{tablenotes}" _n
file write comptable "\end{table}" _n

file close comptable
********************************************************************************
* DISPLAY RESULTS IN CONSOLE
********************************************************************************
* Prints the same four-sample comparison to the Results window, one row per
* variable in varlist. Column positions are chosen so that every header and
* every row label ends before the next _col() position.
di ""
di "FOUR-WAY COMPARISON: WBES (2023) vs WBES (5-50) vs Study (full) vs Study (5-50)"
di "{hline 116}"
di "Variable" _col(24) "WBES_Mean" _col(36) "WBES_N" _col(46) "WBES5_50_Mean" _col(61) "WBES5_50_N" ///
   _col(73) "Full_Mean" _col(85) "Full_N" _col(95) "S5_50_Mean" _col(108) "S5_50_N"
di "{hline 116}"

* Row labels, in the same order as varlist; each is shorter than 23 characters
* so that it cannot run into the first numeric column
local varlabels `" "Abidjan" "Sales (th USD)" "Micro Firm" "Small Firm" "Medium Firm" "Manuf. & Agriculture" "Construction" "Wholesale and Retail" "Other Services" "Firm Age" "Firm Size" "Staff 1" "Staff 2" "Staff 3-4" "Staff 5-10" "Staff 11-50" "Staff 51+" "'

* i indexes both the label list and the matrix columns
local i = 1
foreach var of local varlist {
    local varlabel : word `i' of `varlabels'
    local wbes_mean = `wbes_means'[1,`i']
    local wbes_n = `wbes_ns'[1,`i']
    local wbes130_mean = `wbes130_means'[1,`i']
    local wbes130_n = `wbes130_ns'[1,`i']
    local study_full_mean = `study_full_means'[1,`i']
    local study_full_n = `study_full_ns'[1,`i']
    local study_large_mean = `study_large_means'[1,`i']
    local study_large_n = `study_large_ns'[1,`i']

    di "`varlabel'" ///
       _col(24) %9.3f `wbes_mean' ///
       _col(36) %7.0f `wbes_n' ///
       _col(46) %9.3f `wbes130_mean' ///
       _col(61) %7.0f `wbes130_n' ///
       _col(73) %9.3f `study_full_mean' ///
       _col(85) %7.0f `study_full_n' ///
       _col(95) %9.3f `study_large_mean' ///
       _col(108) %7.0f `study_large_n'

    local i = `i' + 1
}
di "{hline 116}"
di ""
* Report the file name that the table section actually writes
di "Four-way comparison table created: Table_S8_comparison_WBES_study_sample.tex"